This code visualises the models fitted in 02_fit_models.
library(data.table)
library(dplyr)
Registered S3 methods overwritten by 'tibble':
method from
format.tbl pillar
print.tbl pillar
Attaching package: 'dplyr'
The following objects are masked from 'package:data.table':
between, first, last
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(purrr)
Attaching package: 'purrr'
The following object is masked from 'package:data.table':
transpose
library(ggplot2)
library(cowplot)
Attaching package: 'cowplot'
The following object is masked from 'package:ggplot2':
ggsave
library(gridExtra)
Attaching package: 'gridExtra'
The following object is masked from 'package:dplyr':
combine
source("00_helper_funs.R")
Load data:
# Read the formatted experiment data (one row per trial)
d_f <- fread(file.path("..", "data", "cogpsych_data_formatted.csv"))
Get the last observation per learning sequence:
# Keep only the final trial of each learning sequence, then order by the
# between-session interval so later window assignments follow the interval axis
d_last <- d_f[, .SD[.N], by = id]
setorder(d_last, time_between)
Load fitted models:
# Fitted parameters produced by 02_fit_models:
# - lr_tau / lr_activation: logistic-regression fits of the threshold / activation
# - bs_d_indiv / bs_h_indiv: binary-search fits of decay / scaling factor per sequence
# - tau_short: threshold fitted on short (0-10 minute) intervals only
lr_tau <- fread(file.path("..", "data", "logistic_regression_tau.csv"))
lr_activation <- fread(file.path("..", "data", "logistic_regression_activation.csv"))
bs_d_indiv <- fread(file.path("..", "data", "binary_search_indiv_d.csv"))
bs_h_indiv <- fread(file.path("..", "data", "binary_search_indiv_h.csv"))
tau_short <- fread(file.path("..", "data", "logistic_regression_tau_short.csv"))
Set default parameters for the memory model:
# Default memory-model parameters, used by predict_recall() unless overridden
model_params <- list(
s = .5, # passed to p_recall()
decay = .5, # default decay d (ACT-R's default, per the discussion below)
h = 1 # scaling factor; the text notes h is limited to 1
)
Set parameters for splitting the data into windows:
# Numbers of bins used when splitting the interval range into time windows
n_windows <- c(1:10, 20, 25, 50, 100)
General function for predicting recall for a given parameter configuration:
# Predict recall probability for each learning sequence in `seq_list`.
#
# Arguments:
#   seq_list: list of sequences; each element is expected to provide
#             $id, $window, $time_within and $time_between
#             (assumed from usage of generate_seq_list -- TODO confirm).
#   h, decay: either a scalar, or a data.table with per-id values
#             (columns `id` + `h` / `d`).
#   tau:      either a scalar, a data.table with an `id` column, or a
#             data.table with `n_windows` + `window` columns.
#   s:        passed through to p_recall().
#   windows:  the window split to use when looking up a windowed tau.
#
# Returns a data.table with one row per sequence: `id` and `pred_correct`.
predict_recall <- function (seq_list, h = model_params$h, decay = model_params$decay, tau, s = model_params$s, windows = 1) {
  pred_correct <- map_dbl(seq_list, function (x) {
    # Use if/else rather than ifelse(): ifelse() returns a result the length
    # of its (scalar) condition, so a per-id lookup that returned more than
    # one value would be silently truncated to its first element, and any
    # attributes would be stripped.
    seq_d <- if (is.data.table(decay)) decay[id == x$id, d] else decay
    seq_h <- if (is.data.table(h)) h[id == x$id, h] else h
    if (is.data.table(tau)) {
      # A windowed tau table (n_windows column) takes precedence over a
      # per-id table when both columns are present, as in the original code.
      if ("id" %in% names(tau)) seq_tau <- tau[id == x$id, tau]
      if ("n_windows" %in% names(tau)) seq_tau <- tau[n_windows == windows & window == x$window, tau]
    } else {
      seq_tau <- tau
    }
    # activation() and p_recall() are defined in 00_helper_funs.R
    ac <- activation(x$time_within, x$time_between, seq_h, seq_d)
    p_recall(ac, seq_tau, s)
  })
  pred <- data.table(id = map_chr(seq_list, ~.$id), pred_correct = pred_correct)
  return (pred)
}
Define the time windows for all splits of the data:
# For every bin count in n_windows, assign each sequence to a window and
# summarise each window's interval range. Result: one row per
# (n_windows, window) with start, end, and the geometric-mean interval.
window_range <- map_dfr(n_windows, function (n_w) {
  d_windows <- copy(d_last)
  if (n_w == 1) {
    d_windows[, window := 1]
  } else {
    # Equal-width bins on the log-transformed interval scale
    d_windows[, window := cut(log(time_between), breaks = n_w, labels = FALSE)]
  }
  # One row per window: interval range and its geometric mean.
  # (Renamed the local from `window_range` to avoid shadowing the global;
  # dropped a redundant `by` on the per-row geom_mean computation and a
  # no-op `window := window` self-assignment.)
  w_range <- d_windows[, .(start = min(time_between), end = max(time_between)), by = .(window)]
  w_range[, geom_mean := sqrt(start * end)]
  setorder(w_range, window)
  w_range[, n_windows := n_w]
  return (w_range)
})
Plot colours:
# Colours used throughout the figures
pred_col <- "#CC3311" # red: model predictions
obs_col <- "#000000" # black: observed data
window_col <- "#33BBEE" # blue: window background shading
section_col <- "#FD520F" # orange: highlighted data subset
Interpretable time labels:
# Positions (in minutes) and text for the human-readable secondary time axis
label_x <- c(1, 10, 60, 6*60, 24*60, 7*24*60, 7*7*24*60)
label_y <- 1.09 # y position for axis labels; not referenced in the code visible here -- TODO confirm
label_txt <- c("1 min", "10 min", "1 h", "6 h", "24 h", "1 wk", "7 wks")
General function for plotting comparison between data and model:
# Plot observed recall (d_last, global) against model-predicted recall
# (d_model) over the between-session interval, with the n_w-window split
# shown as alternating background shading.
#
# d_model:    data.table with time_between and pred_correct per sequence.
# n_w:        which window split (from window_range) to shade.
# label_pos:  x/y positions (x in minutes) for the "Data" and "Model" labels.
# print_plot: if TRUE, print the plot before returning it.
#
# Returns the ggplot object.
plot_comparison <- function (d_model,
n_w = 1,
label_pos = list(data = list(x = 35000, y = .54),
model = list(x = 35000, y = .46)),
print_plot = TRUE) {
# Maps 0/1 responses to just outside [0, 1] (0 -> -dodge/2, 1 -> 1 + dodge/2)
# so the jittered raw observations sit along the panel edges (clip = "off").
plot_dodge <- function(y, dodge = .1) {
return (y * (1 + dodge) - dodge/2)
}
p <- ggplot() +
# Window background: each rectangle spans from a window's start to the next
# window's start (the last window uses its own end)
geom_rect(data = window_range[n_windows == n_w],
aes(xmin = start/60, xmax = ifelse(is.na(shift(start, -1)), end, shift(start, -1))/60,
ymin = -Inf, ymax = Inf, alpha = as.factor(window)),
fill = window_col) +
# Jittered observations along edges
geom_point(data = d_last,
aes(x = time_between/60, y = plot_dodge(correct, .05)),
position = position_jitter(width = 0, height = .025, seed = 123),
colour = "grey20", size = .001, pch = ".", alpha = .1) +
# GAM: data
geom_smooth(data = d_last,
aes(x = time_between/60, y = correct),
method = "gam", formula = y ~ s(x, bs = "cs"),
colour = obs_col, lty = 1, lwd = 1) +
# GAM: model
geom_smooth(data = d_model,
aes(x = time_between/60, y = pred_correct),
method = "gam", formula = y ~ s(x, bs = "cs"),
colour = pred_col, fill = pred_col, lty = 1, lwd = .75) +
# Labels
annotate("text", x = label_pos$data$x, y = label_pos$data$y,
label = "Data", colour = obs_col) +
annotate("text", x = label_pos$model$x, y = label_pos$model$y,
label = "Model", colour = pred_col) +
# Plot setup: log10 x axis with a secondary axis of readable time labels
scale_x_log10(
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x)),
expand = c(0, 0),
sec.axis = sec_axis(~.x, breaks = label_x, labels = label_txt)
) +
scale_y_continuous(breaks = seq(0, 1, by = .25), labels = scales::percent_format()) +
# Alternate two alpha levels so adjacent windows are distinguishable
scale_alpha_manual(values = rep(c(.1, .25), ceiling(n_w/2))) +
guides(colour = "none",
alpha = "none") +
labs(x = "Between-session interval (minutes)",
y = "Response accuracy") +
annotation_logticks(sides = "b", outside = T) +
# window_range[1, start] / [.N, end] span the full interval range
# (first row is the 1-window split; last row ends at the overall maximum)
coord_cartesian(ylim = c(0, 1), xlim = c(window_range[1, start], window_range[.N, end])/60, clip = "off") +
theme_bw(base_size = 14) +
theme(plot.margin = margin(7, 14, 7, 7),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())
if (print_plot) print(p)
return (p)
}
General function for plotting parameter change over time:
# Plot a fitted parameter against the (geometric-mean) interval of each
# window, with a linear regression line, its equation, and R-squared.
#
# d_parameter:    data.table with n_windows, window, geom_mean and `parameter`.
# parameter_name: plotmath expression used in the displayed equation.
# n_w:            which window split to plot.
# log_x, log_y:   log-transform the corresponding axis (and the regression).
# print_plot:     if TRUE, print the plot before returning it.
#
# Returns the ggplot object.
plot_parameter <- function(d_parameter,
parameter_name = "",
n_w = 1,
log_x = TRUE,
log_y = FALSE,
print_plot = TRUE) {
# Calculate R-squared for the (optionally log-transformed) linear fit
x <- d_parameter[n_windows == n_w, geom_mean]/60
y <- d_parameter[n_windows == n_w, parameter]
if (log_x) x <- log(x)
if (log_y) y <- log(y)
m <- lm(y ~ x)
# Build the plotmath equation label.
# NOTE(review): the label hard-codes a minus sign and uses |slope|, so it
# assumes a negative slope -- confirm for any parameter that increases with t.
eq <- substitute(parameter_name == a-b %*% ln(italic(t)),
list(parameter_name = parameter_name,
a = format(unname(coef(m)[1]), digits = 3),
b = format(abs(unname(coef(m)[2])), digits = 3)))
eq <- as.character(as.expression(eq))
rsq <- paste("R^2 ==", scales::number(summary(m)$r.squared, accuracy = .01))
p <- ggplot() +
# Window background (ymin = 0 on a log y axis, since -Inf is invalid there)
geom_rect(data = window_range[n_windows == n_w],
aes(xmin = start/60, xmax = ifelse(is.na(shift(start, -1)), end, shift(start, -1))/60,
ymin = ifelse(log_y, 0, -Inf), ymax = Inf, alpha = as.factor(window)),
fill = window_col) +
# Regression line
geom_smooth(data = d_parameter[n_windows == n_w],
aes(y = parameter, x = geom_mean/60),
method = "lm", formula = y ~ x,
colour = pred_col, fill = pred_col) +
# Parameter values
geom_point(data = d_parameter[n_windows == n_w],
aes(y = parameter, x = geom_mean/60)) +
scale_alpha_manual(values = rep(c(.1, .25), ceiling(n_w/2))) +
# R-squared label, anchored to the top-right corner
geom_label(aes(x = Inf, y = Inf, label = rsq),
label.padding = unit(.5, "lines"),
label.size = NA,
fill = NA,
hjust = "inward", vjust = "inward",
parse = TRUE) +
# Equation label, anchored to the bottom-left corner
geom_label(aes(x = ifelse(log_x, 0, -Inf), y = ifelse(log_y, 0, -Inf), label = eq),
label.padding = unit(.5, "lines"),
label.size = NA,
fill = NA,
hjust = "inward", vjust = "inward",
parse = TRUE) +
# Plot setup
guides(alpha = "none") +
labs(x = "Between-session interval (minutes)",
y = "Fitted parameter") +
scale_x_continuous(sec.axis = sec_axis(~.x, breaks = label_x, labels = label_txt)) +
coord_cartesian(xlim = c(window_range[1, start], window_range[.N, end])/60,
ylim = c(min(y) - .1*diff(range(y)), max(y) + .1*diff(range(y))),
clip = "off") +
theme_bw(base_size = 14) +
theme(plot.margin = margin(7, 14, 7, 7),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())
# Transform scales if required (replacing the continuous x scale above
# triggers the "Scale for 'x' is already present" message seen in the output)
if (log_x) {
p <- p +
scale_x_log10(
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x)),
expand = c(0, 0),
sec.axis = sec_axis(~.x, breaks = label_x, labels = label_txt)
) +
annotation_logticks(sides = "b", outside = T)
}
if (log_y) {
p <- p +
scale_y_log10() +
annotation_logticks(sides = "l", outside = T)
}
if (print_plot) print(p)
return (p)
}
What does predicted retention look like if we use a single parameter configuration fitted across the whole range of intervals?
Divide the data into learning sequences for plotting:
# Expand the final observations back to their full learning sequences
# (join on id), with everything in a single window
d_single <- copy(d_last)
d_single[, sequence := 1:.N]
d_single <- d_f[d_single[, .(id, sequence)], on = .(id)]
d_single[, window := 1]
d_seq_list_single <- generate_seq_list(d_single)
# Threshold fitted on the whole interval range (1-window fit)
single_tau <- lr_tau[n_windows == 1, tau]
The best-fitting threshold for the entire range of intervals is -4.6072046. If we consistently use this threshold in the model, it initially overpredicts retention (indicating that the threshold is too low relative to the activation), gets the prediction right around the mode of the interval distribution, and eventually underpredicts retention (indicating that the threshold is too high).
# Predict recall using the single globally-fitted threshold and compare to data
pred_single_tau <- predict_recall(seq_list = d_seq_list_single, tau = single_tau)
d_single_tau <- d_last[pred_single_tau, on = .(id)]
p_single_tau <- plot_comparison(d_model = d_single_tau,
n_w = 1,
label_pos = list(data = list(x = 35000, y = .5),
model = list(x = 7000, y = .29)))
We did not determine the decay directly, but derived it from the best-fitting activation. Across the whole data set, the best-fitting activation is -4.1130682. We determined the optimal decay for each item, such that it ended up at this activation. Since activation (and the threshold) are constant across the range, the predicted recall is also simply the best-fitting horizontal line through the data.
# Predict recall with per-sequence decay derived from the best-fitting
# activation (threshold held at the activation-fit value)
pred_single_d <- predict_recall(d_seq_list_single,
decay = bs_d_indiv[n_windows == 1],
tau = lr_activation[n_windows == 1, tau])
d_single_d <- d_last[pred_single_d, on = .(id)]
p_single_d <- plot_comparison(d_model = d_single_d,
n_w = 1,
label_pos = list(data = list(x = 35000, y = .5),
model = list(x = 35000, y = .78)))
Warning in newton(lsp = lsp, X = G$X, y = G$y, Eb = G$Eb, UrS = G$UrS, L =
G$L, : Fitting terminated with step failure - check results carefully
Like the decay, the optimal scaling factor was derived from the best-fitting activation. In principle this should again produce a horizontal line. However, getting this activation value would require scaling by a factor larger than 1 for a large portion of the data. Since h is limited to 1, we cannot get the activation low enough in these cases, and so we overpredict retention.
# Predict recall with per-sequence scaling factor h derived from the
# best-fitting activation (h is limited to 1, per the text above)
pred_single_h <- predict_recall(d_seq_list_single,
h = bs_h_indiv[n_windows == 1],
tau = lr_activation[n_windows == 1, tau])
d_single_h <- d_last[pred_single_h, on = .(id)]
p_single_h <- plot_comparison(d_model = d_single_h,
n_w = 1,
label_pos = list(data = list(x = 35000, y = .5),
model = list(x = 35000, y = .78)))
Rather than determining the best parameter fit from the whole data set, we also explore a scenario in which we only have intervals around 24 hours, which is more similar to many controlled experiments.
We use a subset of the data with intervals around 24 hours. In a 25-window fit this happens to fall quite neatly within window number 17, so we'll use that.
# Visual check that the 24h interval (red dashed lines) falls within
# window 17 of the 25-window split
ggplot(window_range[n_windows == 25]) +
geom_vline(aes(xintercept = 24*60*60), colour = "red", lty = 2) +
geom_hline(aes(yintercept = 17), colour = "red", lty = 2) +
geom_segment(aes(x = start, xend = end, y = window, yend = window)) +
geom_point(aes(x = geom_mean, y = window)) +
geom_text(aes(x = geom_mean, y = window + .5, label = window), colour = "blue") +
scale_x_log10(
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x)),
expand = c(0, 0)
) +
labs(x = "Between-session interval (minutes)",
y = "Window") +
annotation_logticks(sides = "b", outside = T) +
coord_cartesian(clip = "off")
# Threshold fitted on the ~24h window only
tau_24h <- lr_tau[n_windows == 25 & window == 17, tau]
The optimal threshold for the 24h interval is -4.702159, quite similar to the optimal threshold for the whole data set. The model's predictions also look very similar.
# Predict recall across the whole range using the 24h-window threshold;
# the orange rectangle marks the window the parameter was fitted on
pred_tau_24h <- predict_recall(seq_list = d_seq_list_single, tau = tau_24h)
d_tau_24h <- d_last[pred_tau_24h, on = .(id)]
p_tau_24h <- plot_comparison(d_model = d_tau_24h,
n_w = 1,
label_pos = list(data = list(x = 35000, y = .5),
model = list(x = 5000, y = .29))) +
geom_rect(data = window_range[n_windows == 25 & window == 17], aes(xmin = start/60, xmax = end/60, ymin = -0.05, ymax = 1.05), fill = section_col, alpha = .25)
p_tau_24h
Because optimal decay is derived separately for each individual learning sequence, there is not a single decay value associated with the 24h interval, and we only know the best decay for the individual sequences that fall within the 24h set.
# Summarise the per-sequence decay fits in the 24h window by their median,
# and show their distribution
d_24h <- median(bs_d_indiv[n_windows == 25 & window == 17, d])
ggplot(bs_d_indiv[n_windows == 25 & window == 17], aes(x = d)) +
geom_histogram(binwidth = .005, fill = "midnightblue") +
geom_vline(aes(xintercept = d_24h), colour = "red", lty = 2)
These predictions show the typical pattern: the model initially overpredicts recall, gets it right around 24h, and eventually underpredicts recall.
# Predict recall across the whole range using the median 24h-window decay
pred_d_24h <- predict_recall(d_seq_list_single,
decay = d_24h,
tau = lr_activation[n_windows == 25 & window == 17, tau])
d_d_24h <- d_last[pred_d_24h, on = .(id)]
p_d_24h <- plot_comparison(d_model = d_d_24h,
n_w = 1,
label_pos = list(data = list(x = 35000, y = .5),
model = list(x = 5000, y = .78)),
print_plot = FALSE) +
geom_rect(data = window_range[n_windows == 25 & window == 17], aes(xmin = start/60, xmax = end/60, ymin = -0.05, ymax = 1.05), fill = section_col, alpha = .25)
p_d_24h
# Median per-sequence scaling factor in the 24h window, with its distribution
h_24h <- median(bs_h_indiv[n_windows == 25 & window == 17, h])
ggplot(bs_h_indiv[n_windows == 25 & window == 17], aes(x = h)) +
geom_histogram(binwidth = .01, fill = "midnightblue") +
geom_vline(aes(xintercept = h_24h), colour = "red", lty = 2)
This model already does surprisingly well: short-term predictions are good, though the model is too optimistic on intervals of 1-24 hours, and too pessimistic on intervals longer than a week.
# Predict recall across the whole range using the median 24h-window scaling factor
pred_h_24h <- predict_recall(d_seq_list_single,
h = h_24h,
tau = lr_activation[n_windows == 25 & window == 17, tau])
d_h_24h <- d_last[pred_h_24h, on = .(id)]
p_h_24h <- plot_comparison(d_model = d_h_24h,
n_w = 1,
label_pos = list(data = list(x = 35000, y = .5),
model = list(x = 12000, y = .2)),
print_plot = FALSE) +
geom_rect(data = window_range[n_windows == 25 & window == 17], aes(xmin = start/60, xmax = end/60, ymin = -0.05, ymax = 1.05), fill = section_col, alpha = .25)
p_h_24h
Finally, we'll look at a model with default parameters other than the threshold, which is fitted to short intervals (0-10 minutes) only. This model clearly illustrates the problem of underpredicting retention between sessions when we extrapolate from a single-session fit.
# Predict recall using the threshold fitted on short (0-10 min) intervals only;
# the orange rectangle marks the fitted interval range
pred_tau_short <- predict_recall(d_seq_list_single,
tau = tau_short$tau_short)
d_tau_short <- d_last[pred_tau_short, on = .(id)]
p_tau_short <- plot_comparison(d_model = d_tau_short,
n_w = 1,
label_pos = list(data = list(x = 35000, y = .5),
model = list(x = 35000, y = .08)),
print_plot = FALSE) +
geom_rect(aes(xmin = min(window_range$start)/60, xmax = 10, ymin = -0.05, ymax = 1.05), fill = section_col, alpha = .25)
p_tau_short
Rather than using a single parameter configuration across the range, we can also fit parameters separately per time bin. Here we'll look at the results when the data are split into 20 bins of equal width on the log-transformed interval scale.
Prepare the data:
# Assign each sequence to one of 20 equal-width bins on the log interval scale,
# rebuild the full sequences, and predict recall with the per-window thresholds
d_20 <- copy(d_last)
d_20 <- d_20[, sequence := 1:.N][, .(id, sequence, time_between, window = cut(log(time_between), breaks = 20, labels = FALSE))]
d_20_seq <- generate_seq_list(d_20[d_f, on = .(id)])
pred_tau_20 <- predict_recall(seq_list = d_20_seq, tau = lr_tau, windows = 20)
d_tau_20 <- d_last[pred_tau_20, on = .(id)]
p_tau_20 <- plot_comparison(d_model = d_tau_20,
n_w = 20,
label_pos = list(data = list(x = 35000, y = .5),
model = list(x = 20000, y = .35)))
Plotting the optimal parameter per time bin shows a systematic change over time (note the log-transformed x-axis):
# Attach window interval info to the fitted thresholds; plot_parameter()
# expects the parameter column to be named `parameter`
lr_tau_viz <- lr_tau[window_range, on = .(n_windows, window)]
setnames(lr_tau_viz, "tau", "parameter")
p_tau_time <- plot_parameter(d_parameter = lr_tau_viz,
parameter_name = quote(tau),
n_w = 20)
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Warning: Transformation introduced infinite values in continuous x-axis
How well does the model fit if we use the fitted LM to set the threshold?
# Fit tau as a linear function of log interval, then use the LM to set a
# per-sequence threshold
m_tau <- lm(parameter ~ log(geom_mean), data = lr_tau_viz[n_windows == 20])
d_tau_lm <- map_dfr(d_20_seq, function(x) {
# Name the interval geom_mean so predict() finds the model's predictor
list(id = x$id, geom_mean = x$time_between)
})
d_tau_lm$tau <- predict(m_tau, newdata = d_tau_lm)
setDT(d_tau_lm)
pred_tau_lm <- predict_recall(seq_list = d_20_seq, tau = d_tau_lm)
d_tau_lm <- d_last[pred_tau_lm, on = .(id)]
p_tau_lm <- plot_comparison(d_model = d_tau_lm,
n_w = 20,
label_pos = list(data = list(x = 35000, y = .5),
model = list(x = 20000, y = .29)))
The figure below shows the optimal model parameter for different numbers of bins, and confirms that the 20-bin split is not qualitatively different from other comparable splits.
# One tau-over-time panel for each bin count in n_windows
p_tau_n <- map(n_windows, function(n_w) {
p_n <- plot_parameter(d_parameter = lr_tau_viz,
parameter_name = quote(tau),
n_w = n_w,
print_plot = FALSE) +
labs(x = "Interval",
y = "Parameter",
title = paste("Bins:", n_w))
p_n
})
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
# Arrange the per-bin-count tau panels in a 4-column grid
do.call("grid.arrange", c(p_tau_n, ncol = 4))
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning in qt((1 - level)/2, df): NaNs produced
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max;
returning -Inf
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Here we keep the threshold fixed but find the optimal decay parameter in each of the 20 bins.
# Fixed threshold, per-window optimal decay (20-bin split)
pred_d_20 <- predict_recall(seq_list = d_20_seq,
decay = bs_d_indiv[n_windows == 20],
tau = lr_activation,
windows = 20)
d_d_20 <- d_last[pred_d_20, on = .(id)]
p_d_20 <- plot_comparison(d_model = d_d_20,
n_w = 20,
label_pos = list(data = list(x = 35000, y = .5),
model = list(x = 20000, y = .35)))
The decay parameter also changes over time in a systematic fashion (note the log-transformed x-axis). Interestingly, ACT-R's default value for this parameter is 0.5, which matches what we find for intervals up to a few minutes in length.
# Summarise per-sequence decay fits by their median per window, then plot
# the parameter against the window's geometric-mean interval
bs_d_viz <- bs_d_indiv[window_range, on = .(n_windows, window)][, .(d = median(d)), by = .(n_windows, window, start, end, geom_mean)]
setnames(bs_d_viz, "d", "parameter")
p_d_time <- plot_parameter(d_parameter = bs_d_viz,
parameter_name = quote(italic(d)),
n_w = 20)
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Warning: Transformation introduced infinite values in continuous x-axis
Model predictions when the decay parameter is determined by the fitted LM:
# Fit decay as a linear function of log interval, then use the LM to set a
# per-sequence decay
m_d <- lm(parameter ~ log(geom_mean), data = bs_d_viz[n_windows == 20])
d_d_lm <- map_dfr(d_20_seq, function(x) {
# Name the interval geom_mean so predict() finds the model's predictor
list(id = x$id, geom_mean = x$time_between)
})
d_d_lm$d <- predict(m_d, newdata = d_d_lm)
setDT(d_d_lm)
pred_d_lm <- predict_recall(seq_list = d_20_seq, decay = d_d_lm, tau = lr_activation, windows = 20)
d_d_lm <- d_last[pred_d_lm, on = .(id)]
p_d_lm <- plot_comparison(d_model = d_d_lm,
n_w = 20,
label_pos = list(data = list(x = 35000, y = .5),
model = list(x = 30000, y = .64)))
The figure below shows the optimal model parameter for different numbers of bins, and confirms that the 20-bin split is not qualitatively different from other comparable splits.
# One decay-over-time panel for each bin count in n_windows
p_d_n <- map(n_windows, function(n_w) {
p_n <- plot_parameter(d_parameter = bs_d_viz,
parameter_name = quote(italic(d)),
n_w = n_w,
print_plot = FALSE) +
labs(x = "Interval",
y = "Parameter",
title = paste("Bins:", n_w))
p_n
})
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
# Arrange the per-bin-count decay panels in a 4-column grid
do.call("grid.arrange", c(p_d_n, ncol = 4))
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning in qt((1 - level)/2, df): NaNs produced
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max;
returning -Inf
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Here the scaling factor is fitted separately on each time bin, while the threshold and decay stay constant.
# Fixed threshold and decay, per-window optimal scaling factor (20-bin split)
pred_h_20 <- predict_recall(seq_list = d_20_seq,
h = bs_h_indiv[n_windows == 20],
tau = lr_activation,
windows = 20)
d_h_20 <- d_last[pred_h_20, on = .(id)]
p_h_20 <- plot_comparison(d_model = d_h_20,
n_w = 20,
label_pos = list(data = list(x = 35000, y = .5),
model = list(x = 20000, y = .35)))
Once again, there is a consistent change in the parameter over time (note that here both axes are log-transformed):
# Median scaling factor per window; plotted with a log y axis. The added
# coord_cartesian deliberately replaces the one set inside plot_parameter()
# (hence the "Coordinate system already present" message in the output)
bs_h_viz <- bs_h_indiv[window_range, on = .(n_windows, window)][, .(h = median(h)), by = .(n_windows, window, start, end, geom_mean)]
setnames(bs_h_viz, "h", "parameter")
p_h_time <- plot_parameter(d_parameter = bs_h_viz,
parameter_name = quote(ln(italic(h))),
log_y = TRUE,
n_w = 20,
print_plot = FALSE) +
coord_cartesian(xlim = c(window_range[1, start], window_range[.N, end])/60, ylim = c(.001, 1.2), clip = "off") +
theme(axis.text.y = element_text(margin = margin(r = 8)))
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Print the scaling-factor-over-time plot
p_h_time
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Model predictions when the scaling factor is determined by the fitted LM:
# Fit log(h) as a linear function of log interval (log-log fit), then
# back-transform with exp() to get a per-sequence scaling factor
m_h <- lm(log(parameter) ~ log(geom_mean), data = bs_h_viz[n_windows == 20])
d_h_lm <- map_dfr(d_20_seq, function(x) {
# Name the interval geom_mean so predict() finds the model's predictor
list(id = x$id, geom_mean = x$time_between)
})
d_h_lm$h <- exp(predict(m_h, newdata = d_h_lm))
setDT(d_h_lm)
pred_h_lm <- predict_recall(seq_list = d_20_seq, h = d_h_lm, tau = lr_activation, windows = 20)
d_h_lm <- d_last[pred_h_lm, on = .(id)]
p_h_lm <- plot_comparison(d_model = d_h_lm,
n_w = 20,
label_pos = list(data = list(x = 35000, y = .5),
model = list(x = 30000, y = .64)))
The figure below shows the optimal model parameter for different numbers of bins, and confirms that the 20-bin split is not qualitatively different from other comparable splits, at least those with fewer bins. Splits with more bins seem to suffer from excessive noise.
# One scaling-factor-over-time panel for each bin count in n_windows
# (clip = "on" here, unlike the single 20-bin plot above)
p_h_n <- map(n_windows, function(n_w) {
p_n <- plot_parameter(d_parameter = bs_h_viz,
parameter_name = quote(ln(italic(h))),
log_y = TRUE,
n_w = n_w,
print_plot = FALSE) +
coord_cartesian(xlim = c(window_range[1, start], window_range[.N, end])/60, ylim = c(.001, 1.2), clip = "on") +
theme(axis.text.y = element_text(margin = margin(r = 8))) +
labs(x = "Interval",
y = "Parameter",
title = paste("Bins:", n_w))
p_n
})
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Scale for 'x' is already present. Adding another scale for 'x', which
will replace the existing scale.
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Arrange the per-bin-count scaling-factor panels in a 4-column grid
do.call("grid.arrange", c(p_h_n, ncol = 4))
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning in qt((1 - level)/2, df): NaNs produced
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max;
returning -Inf
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
A histogram of the between-session intervals in the data.
# Histogram of the between-session intervals in the data, on a log10 x-axis,
# with the single-window (n_windows == 1) range shaded in the background.
# Fixes: replace the deprecated `..ncount..` dot-dot notation with
# `after_stat(ncount)` (available since ggplot2 3.3.0; this session attaches
# 3.3.5) and spell out TRUE instead of the reassignable shorthand T.
p_histogram <- ggplot() +
  # Window background: shade the full-data window range (converted to minutes)
  geom_rect(data = window_range[n_windows == 1],
            aes(xmin = start/60, xmax = end/60, ymin = -Inf, ymax = Inf),
            fill = window_col, alpha = .1) +
  # Histogram of the last observation per sequence; ncount normalises bin
  # counts so the tallest bin equals 1 (the y-axis label calls this "Density")
  geom_histogram(data = d_last,
                 aes(x = time_between/60, y = after_stat(ncount)),
                 bins = 100, fill = obs_col) +
  # Plot setup: log10 x-axis with 10^k breaks and a secondary axis carrying
  # human-readable interval labels (label_x / label_txt defined elsewhere)
  scale_x_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x)),
    expand = c(0, 0),
    sec.axis = sec_axis(~.x, breaks = label_x, labels = label_txt)
  ) +
  scale_y_continuous(breaks = seq(0, 1, by = .25)) +
  labs(x = "Between-session interval (minutes)",
       y = "Density") +
  # Log tick marks below the panel; clip = "off" below lets them draw outside
  annotation_logticks(sides = "b", outside = TRUE) +
  coord_cartesian(ylim = c(0, 1),
                  xlim = c(window_range[1, start], window_range[.N, end])/60,
                  clip = "off") +
  theme_bw(base_size = 14) +
  theme(plot.margin = margin(7, 14, 7, 7),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank())
p_histogram
# Assemble the main results figure: the interval histogram plus the seven
# model-fitting panels, laid out in two columns, then write it to disk.
fig_panels <- list(
  p_histogram, p_tau_short,
  p_tau_24h,   p_tau_20,
  p_d_24h,     p_d_20,
  p_h_24h,     p_h_20
)
fig_labels <- c(
  "a\t\tInterval distribution",
  "b\t\tThreshold optimised for 0 - 10 min",
  "c\t\tThreshold optimised for 24h",
  "d\t\tInterval-dependent threshold",
  "e\t\tDecay optimised for 24h",
  "f\t\tInterval-dependent decay",
  "g\t\tScaling factor optimised for 24h",
  "h\t\tInterval-dependent scaling factor"
)
plot_grid(
  plotlist = fig_panels,
  ncol = 2,
  labels = fig_labels,
  align = "hv",
  label_x = .025,
  hjust = 0,
  scale = .9
) +
  # White background so the exported PNG has no transparent canvas
  theme(plot.background = element_rect(fill = "white", colour = NA))
ggsave(file.path("..", "output", "model_fitting_results.png"), width = 10, height = 15)
# Side-by-side panels showing how each fitted parameter (threshold tau,
# decay d, scaling factor h) varies with the between-session interval.
p_params_time <- plot_grid(
  p_tau_time, p_d_time, p_h_time,
  ncol = 3,
  labels = c(
    "a\t\tInterval-dependent threshold",
    "b\t\tInterval-dependent decay",
    "c\t\tInterval-dependent h"
  ),
  align = "hv",
  label_x = .025,
  hjust = 0,
  scale = .9
) +
  # White background so the exported PNG has no transparent canvas
  theme(plot.background = element_rect(fill = "white", colour = NA))
# Auto-print the combined figure (same as evaluating the call directly)
p_params_time
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
# Save the most recently displayed plot (the three-panel parameter grid)
# to the output directory.
# NOTE(review): cowplot 0.9.4 masks ggplot2::ggsave (see attach messages at
# the top of this file), so this resolves to cowplot's wrapper — confirm the
# intended behaviour is unchanged.
ggsave(file.path("..", "output", "params_time.png"), width = 12, height = 4)
# Record the R version, platform, locale, and package versions used to
# produce this output, for reproducibility.
sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] gridExtra_2.3 cowplot_0.9.4 ggplot2_3.3.5 purrr_0.3.2
[5] dplyr_1.0.7 data.table_1.13.6
loaded via a namespace (and not attached):
[1] Rcpp_1.0.6 pillar_1.6.3 compiler_3.6.3 tools_3.6.3
[5] digest_0.6.19 lattice_0.20-41 nlme_3.1-149 jsonlite_1.6
[9] evaluate_0.14 lifecycle_1.0.1 tibble_2.1.3 gtable_0.3.0
[13] mgcv_1.8-28 pkgconfig_2.0.2 rlang_0.4.10 Matrix_1.2-18
[17] DBI_1.1.0 yaml_2.2.0 xfun_0.21 withr_2.3.0
[21] stringr_1.4.0 knitr_1.23 generics_0.1.0 vctrs_0.3.8
[25] grid_3.6.3 tidyselect_1.1.1 glue_1.4.2 R6_2.4.0
[29] fansi_0.4.0 rmarkdown_2.6 farver_2.1.0 magrittr_2.0.1
[33] splines_3.6.3 scales_1.1.1 ellipsis_0.3.2 htmltools_0.3.6
[37] colorspace_1.4-1 labeling_0.3 utf8_1.1.4 stringi_1.4.3
[41] munsell_0.5.0 crayon_1.4.1